We want to look at real datasets to simulate realistic datasets.
data("cortical")
pathLabels = "~/Documents/BRAIN/gitrepo/brainData/analysis/results_160705/cluster_labels.txt"
labels <- read.table(pathLabels, stringsAsFactors = FALSE, sep='\t',
comment.char = "%")
prefilter <- cortical[,rownames(labels[!(labels$ClusterMergedLabel %in% c("-1", "m2", "m4", "m5", "m6")),])]
filter <- apply(assay(prefilter)>10, 1, sum)>=10
postfilter <- prefilter[filter,]
batch <- droplevels(colData(postfilter)$MD_c1_run_id)
qc <- as.matrix(colData(postfilter)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~ batch + qcpca$x[, 1:2])
bio <- factor(labels[!(labels$ClusterMergedLabel %in% c("-1", "m2", "m4", "m5", "m6")), "ClusterMergedColor"])
postfilter = assay(postfilter)
vars <- rowVars(log1p(postfilter))
names(vars) <- rownames(postfilter)
vars <- sort(vars, decreasing = TRUE)
core <- postfilter[names(vars)[1:1000],]
detection_rate <- colSums(core>0)
coverage <- colSums(core)
We only focus on the cells that passed the QC filters (by the original authors) and that were part of the “core” clusters. We will color-code the cells by either known cell type or by inferred cluster (inferred in the original study). Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells. Number of retained genes:
print(sum(filter))
## [1] 7591
To speed up the computations, I will focus on the top 1,000 most variable genes. IS IT A GOOD IDEA?
mean = apply(log1p(assay(prefilter)), 1, function(x) mean(x[x!=0]))
plot(mean, rowMeans(assay(prefilter) == 0), xlim = c(1,11), ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
main = 'no filtering')
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
par(mfrow = c(1,3))
smoothScatter(mean, rowMeans(assay(prefilter) == 0), xlim = c(0,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp =pal, main = 'no filtering')
mean = apply(log1p(postfilter), 1, function(x) mean(x[x!=0]))
smoothScatter(mean, rowMeans(postfilter == 0),xlim = c(1,11),
nbin = 256, nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'after filtering')
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
smoothScatter(mean, rowMeans(core == 0),xlim = c(1,11),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = '1,000 most variable genes')
Pink and glia cells have been removed. Fit data with K = 3, V intercepts in both Mu and Pi, commondispersion = FALSE, with X accounting for batch, qc.
print(system.time(zinb <- zinbFit(core, ncores = 3, K = 3, X = mod,
commondispersion = FALSE)))
## user system elapsed
## 693.297 313.229 425.589
If we simulate W from real data, it will look like that.
pairs(zinb@W, col=cols[bio], pch=19, main="W\ncolor = analysis results 160705")
gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]
df <- data.frame(gamma_mu = gamma_mu,
gamma_pi = gamma_pi,
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.2439368 0.5252101 0.9883636
## gamma_pi -0.2439368 1.0000000 -0.8470762 -0.2861173
## detection_rate 0.5252101 -0.8470762 1.0000000 0.5482978
## coverage 0.9883636 -0.2861173 0.5482978 1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi, celltype = bio)
gamma = melt(gamma)
ggplot(gamma, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 20, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(~ variable) + xlab('gamma')
ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
geom_boxplot() + coord_flip() + facet_grid(~ variable, scales = 'free') +
theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)
ggplot(gamma, aes(value, fill = celltype)) + geom_density(alpha = 0.2) +
facet_grid(~ variable) + xlab('gamma') + theme_bw()
Because we accounting for batch and qc in X, beta has M = 25 rows.
beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]
df <- data.frame(beta_mu_intercept = beta_mu,
beta_pi_intercept = beta_pi)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## beta_mu_intercept beta_pi_intercept
## beta_mu_intercept 1.0000000 -0.3610809
## beta_pi_intercept -0.3610809 1.0000000
beta = data.frame(beta_mu_intercept = beta_mu, beta_pi_intercept = beta_pi)
beta = melt(beta)
ggplot(beta, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 100, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(~ variable, scales = 'free') + xlab('beta')
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
geom_boxplot() + facet_grid(~ variable, scales = 'free') + coord_flip() +
theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)
# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
geom_boxplot(outlier.shape = NA) +
facet_grid(~ variable, scales = 'free') + coord_flip() +
theme_bw() + ylab('beta removing outliers') +
scale_x_discrete(breaks = c('', ''), drop=FALSE) +
scale_y_continuous(limits = c(min, max))
## Warning: Removed 1314 rows containing non-finite values (stat_boxplot).
pairs(t(zinb@alpha_mu))
pairs(t(zinb@alpha_pi))
df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
alpha_mu_2 = zinb@alpha_mu[2, ],
alpha_mu_3 = zinb@alpha_mu[3, ],
alpha_pi_1 = zinb@alpha_pi[1, ],
alpha_pi_2 = zinb@alpha_pi[2, ],
alpha_pi_3 = zinb@alpha_pi[3, ])
pairs(df, pch=19)
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_mu_3 alpha_pi_1
## alpha_mu_1 1.000000000 -0.002558535 -0.118975103 -0.300629485
## alpha_mu_2 -0.002558535 1.000000000 -0.007490167 -0.051727888
## alpha_mu_3 -0.118975103 -0.007490167 1.000000000 0.049573562
## alpha_pi_1 -0.300629485 -0.051727888 0.049573562 1.000000000
## alpha_pi_2 -0.020078504 -0.104977173 -0.067802696 -0.003056859
## alpha_pi_3 0.141028173 -0.090183390 -0.307602548 -0.040068676
## alpha_pi_2 alpha_pi_3
## alpha_mu_1 -0.020078504 0.14102817
## alpha_mu_2 -0.104977173 -0.09018339
## alpha_mu_3 -0.067802696 -0.30760255
## alpha_pi_1 -0.003056859 -0.04006868
## alpha_pi_2 1.000000000 0.08323482
## alpha_pi_3 0.083234819 1.00000000
Dispersion was estimated on FQ norm counts using function estimateDisp from edgeR. Dispersions estimated using edgeR and ZINB does not seem to be on the same scale. I think it is because I estimate dispersion on (normalized) counts with edgeR, but (if I understood well) zinb estimate the dispersion on log1p(counts). We should rely on values of dispersion from edgeR as we are going to simulates the counts with \[Y_{i,j} \sim NB(\mu_{i,j}, \phi_j)\]
par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, zinb@zeta, ylab = 'zinb dispersion',
xlab = 'edgeR tagwise dispersion', main = 'Dispersion')
print(cor(disp$tagwise.dispersion, zinb@zeta, method="spearman"))
## [1] -0.7247025
plot(density(disp$tagwise.dispersion), main = 'edgeR tagwise dispersion')
par(mfrow=c(1, 2))
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
plot(rowMeans(getPi(zinb)), detection_rate, xlab="Average estimated Pi", ylab="Detection Rate for each cell", pch=19, col=cols[bio], ylim = c(0, 1))
plot(rowMeans(log1p(getMu(zinb))), coverage, xlab="Average estimated log Mu", ylab="Coverage", pch=19, col=cols[bio])
par(mfrow=c(1, 3))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
smoothScatter(mean, rowMeans(core == 0),xlim = c(2,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'True')
plot(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)),
xlab = "Estimated Mean Expression", xlim = c(2,8),
ylab = "Estimated Dropout Rate", pch=19, col=cols[bio],
ylim = c(0, 1), main = 'Estimated')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(2,8),
xlab = "Estimated Mean Expression", main = 'Estimated',
ylab = "Estimated Dropout Rate",ylim = c(0,1),
colramp = pal)